#importing the numpy library
import numpy as np
# for dataframe manipulations
import pandas as pd
# for Data Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
# for data analysis
import dabl
# Load the dataset.
# The path is a raw string so the backslash stays literal: '\c' is not a valid
# escape sequence, and a non-raw literal triggers an invalid-escape warning on
# recent Python versions.
data = pd.read_csv(r'R:\customer-data-marketing-campaign.csv')
# Print the (rows, columns) shape of the loaded frame as a quick sanity check.
print('Shape of the dataset: ', data.shape)
Shape of the dataset: (2240, 29)
# Preview the first five rows of the dataset.
first_rows = data.head(n = 5)
first_rows
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 04-09-2012 | 58 | 635 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 08-03-2014 | 38 | 11 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | 426 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10-02-2014 | 26 | 11 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
5 rows × 29 columns
# Draw the pairwise relationship grid for the dataset, then render it.
sns.pairplot(data = data)
plt.show()
# Correlation heatmap for the data.
# Only numeric columns are correlated: the frame also holds string columns
# (Education, Marital_Status, Dt_Customer), and recent pandas versions raise
# instead of silently dropping them inside DataFrame.corr().
numeric_corr = data.select_dtypes(include = 'number').corr()
sns.heatmap(numeric_corr, annot = True, cmap = 'copper')
plt.title('Correlation Heatmap for Customer Data', fontsize = 16)
plt.show()
# Analyze the data with respect to income: dabl plots the dataset using
# 'Income' as the target column.
dabl.plot(data, target_col = 'Income')
Target looks like regression
[<AxesSubplot: title={'center': 'Target distribution'}, xlabel='Income', ylabel='frequency'>,
array([[<AxesSubplot: title={'center': 'F=8.43E-01'}, xlabel='MntWines', ylabel='Income'>,
<AxesSubplot: title={'center': 'F=8.25E-01'}, xlabel='MntMeatProducts'>,
<AxesSubplot: title={'center': 'F=7.99E-01'}, xlabel='NumCatalogPurchases (jittered)'>,
<AxesSubplot: title={'center': 'F=7.50E-01'}, xlabel='NumStorePurchases (jittered)'>,
<AxesSubplot: title={'center': 'F=-6.41E-01'}, xlabel='NumWebVisitsMonth (jittered)'>],
[<AxesSubplot: title={'center': 'F=5.91E-01'}, xlabel='MntFruits', ylabel='Income'>,
<AxesSubplot: title={'center': 'F=5.88E-01'}, xlabel='NumWebPurchases (jittered)'>,
<AxesSubplot: title={'center': 'F=5.87E-01'}, xlabel='MntFishProducts'>,
<AxesSubplot: title={'center': 'F=5.78E-01'}, xlabel='MntSweetProducts'>,
<AxesSubplot: title={'center': 'F=5.19E-01'}, xlabel='MntGoldProds'>],
[<AxesSubplot: title={'center': 'F=-2.21E-01'}, xlabel='Year_Birth', ylabel='Income'>,
<AxesSubplot: title={'center': 'F=-1.95E-01'}, xlabel='NumDealsPurchases (jittered)'>,
<AxesSubplot: title={'center': 'F=8.11E-03'}, xlabel='Recency'>,
<AxesSubplot: title={'center': 'F=3.48E-03'}, xlabel='ID'>,
<AxesSubplot: >]], dtype=object),
array([[<AxesSubplot: title={'center': 'F=2.18E-01'}, xlabel='Income', ylabel='Education'>,
<AxesSubplot: title={'center': 'F=1.77E-01'}, xlabel='Income', ylabel='Marital_Status'>,
<AxesSubplot: title={'center': 'F=1.29E-01'}, xlabel='Income', ylabel='Kidhome'>],
[<AxesSubplot: title={'center': 'F=1.21E-01'}, xlabel='Income', ylabel='Teenhome'>,
<AxesSubplot: title={'center': 'F=8.29E-02'}, xlabel='Income', ylabel='AcceptedCmp3'>,
<AxesSubplot: title={'center': 'F=7.32E-02'}, xlabel='Income', ylabel='AcceptedCmp4'>],
[<AxesSubplot: title={'center': 'F=3.80E-02'}, xlabel='Income', ylabel='AcceptedCmp5'>,
<AxesSubplot: title={'center': 'F=3.65E-02'}, xlabel='Income', ylabel='AcceptedCmp1'>,
<AxesSubplot: title={'center': 'F=1.82E-02'}, xlabel='Income', ylabel='Response'>]],
dtype=object)]
# Analyze the data with respect to education: dabl plots the dataset using
# 'Education' as the target column.
dabl.plot(data, target_col = 'Education')
Target looks like classification Linear Discriminant Analysis training set score: 0.246
[[<Figure size 2000x1500 with 15 Axes>, <Figure size 1600x400 with 4 Axes>, <Figure size 1600x400 with 4 Axes>, <Figure size 1600x400 with 4 Axes>], None]
# Summary statistics (count, mean, std, quartiles, ...) with default settings.
summary_stats = data.describe()
summary_stats
| ID | Year_Birth | Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2240.000000 | 2240.000000 | 2216.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | ... | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.0 | 2240.0 | 2240.000000 |
| mean | 5592.159821 | 1968.805804 | 52247.251354 | 0.444196 | 0.506250 | 49.109375 | 303.935714 | 26.302232 | 166.950000 | 37.525446 | ... | 5.316518 | 0.072768 | 0.074554 | 0.072768 | 0.064286 | 0.013393 | 0.009375 | 3.0 | 11.0 | 0.149107 |
| std | 3246.662198 | 11.984069 | 25173.076661 | 0.538398 | 0.544538 | 28.962453 | 336.597393 | 39.773434 | 225.715373 | 54.628979 | ... | 2.426645 | 0.259813 | 0.262728 | 0.259813 | 0.245316 | 0.114976 | 0.096391 | 0.0 | 0.0 | 0.356274 |
| min | 0.000000 | 1893.000000 | 1730.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 25% | 2828.250000 | 1959.000000 | 35303.000000 | 0.000000 | 0.000000 | 24.000000 | 23.750000 | 1.000000 | 16.000000 | 3.000000 | ... | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 50% | 5458.500000 | 1970.000000 | 51381.500000 | 0.000000 | 0.000000 | 49.000000 | 173.500000 | 8.000000 | 67.000000 | 12.000000 | ... | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 75% | 8427.750000 | 1977.000000 | 68522.000000 | 1.000000 | 1.000000 | 74.000000 | 504.250000 | 33.000000 | 232.000000 | 50.000000 | ... | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| max | 11191.000000 | 1996.000000 | 666666.000000 | 2.000000 | 2.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | ... | 20.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.0 | 11.0 | 1.000000 |
8 rows × 26 columns
# Summary (count, unique, top, freq) restricted to the object-typed columns.
object_summary = data.describe(include = 'object')
object_summary
| Education | Marital_Status | Dt_Customer | |
|---|---|---|---|
| count | 2240 | 2240 | 2240 |
| unique | 5 | 8 | 663 |
| top | Graduation | Married | 31-08-2012 |
| freq | 1127 | 864 | 12 |
# True if at least one cell anywhere in the frame is null.
null_mask = data.isnull()
columns_with_nulls = null_mask.any()
columns_with_nulls.any()
False
import warnings
# Silence library warnings from here on.
warnings.filterwarnings('ignore')

# Set the seaborn style once, before any axes are created, so both panels are
# drawn with the same look.
sns.set(style = 'whitegrid')
plt.rcParams['figure.figsize'] = (18, 8)

# Left panel: distribution of Income.
# histplot replaces the deprecated distplot; with kde = True it draws the
# histogram in counts plus a density curve, which matches the 'Count' y-label
# used below (distplot put a density on the y-axis).
plt.subplot(1, 2, 1)
sns.histplot(data['Income'], kde = True)
plt.title('Distribution of Income', fontsize = 20)
plt.xlabel('Range of Income')
plt.ylabel('Count')

# Right panel: distribution of Recency.
plt.subplot(1, 2, 2)
sns.histplot(data['Recency'], kde = True, color = 'red')
plt.title('Distribution of Recency', fontsize = 20)
plt.xlabel('Recency')
plt.ylabel('Count')
plt.show()
# Pie chart of the share of each education level.
# value_counts() orders the categories by frequency, so the labels are taken
# from its index; a hand-written label list only matches the wedges if it
# happens to be in that same order.
size = data['Education'].value_counts()
labels = size.index.tolist()
colors = ['yellow', 'green', 'orange', 'blue', 'red']
plt.rcParams['figure.figsize'] = (9, 9)
plt.pie(size, colors = colors, labels = labels, shadow = True, startangle = 90, autopct = '%.2f%%')
plt.title('Education type', fontsize = 20)
plt.axis('off')
plt.legend()
plt.show()
# Checking the distribution of Income.
# histplot(kde = True) replaces the deprecated distplot: histogram plus a
# density curve.
plt.rcParams['figure.figsize'] = (30, 12)
sns.histplot(data['Income'], kde = True, color = 'red')
plt.title('Distribution of Income', fontsize = 20)
plt.show()
# Checking the distribution of the amount spent on wines.
# histplot(kde = True) replaces the deprecated distplot: histogram plus a
# density curve.
plt.rcParams['figure.figsize'] = (30, 12)
sns.histplot(data['MntWines'], kde = True, color = 'blue')
plt.title('Distribution of MntWines', fontsize = 20)
plt.show()
# Education vs Income, drawn three ways in sequence: boxen plot, violin plot
# and strip plot. Each entry is (plot function, figure size, extra options).
education_income_views = (
    (sns.boxenplot, (18, 10), {'palette': 'Blues'}),
    (sns.violinplot, (18, 7), {'palette': 'rainbow'}),
    (sns.stripplot, (18, 7), {'palette': 'Purples', 'size': 10}),
)
for draw_view, figure_size, view_options in education_income_views:
    plt.rcParams['figure.figsize'] = figure_size
    draw_view(data = data, x = 'Education', y = 'Income', **view_options)
    plt.title('Education vs Income', fontsize = 20)
    plt.show()
# Line plots with Income on the x-axis: first against Year_Birth, then
# against Education. Each entry is (y column, line color, plot title).
# NOTE(review): Education is a string column (it appears in the object-dtype
# summary); confirm a line plot with it on the y-axis is what is intended.
income_line_views = (
    ('Year_Birth', 'blue', 'Income vs Year of birth'),
    ('Education', 'pink', 'Income vs Education'),
)
for y_column, line_color, plot_title in income_line_views:
    sns.lineplot(data = data, x = 'Income', y = y_column, color = line_color)
    plt.title(plot_title, fontsize = 20)
    plt.show()
# Income and Kidhome
import warnings
warnings.filterwarnings('ignore')
# Select the Income and Kidhome columns as the clustering input.
# Rows with a missing value in either column are dropped first, because KMeans
# raises on NaN input; this is a no-op when neither column has missing values.
x = data.loc[:, ['Income', 'Kidhome']].dropna().values
# Check the shape of x: (number of complete rows, 2).
print(x.shape)
(2240, 2)
# Wrap the clustering input in a DataFrame and preview its first five rows.
# Column 0 holds Income and column 1 holds Kidhome.
x_data = pd.DataFrame(data = x)
x_data.head(n = 5)
| 0 | 1 | |
|---|---|---|
| 0 | 58138 | 0 |
| 1 | 46344 | 1 |
| 2 | 71613 | 0 |
| 3 | 26646 | 1 |
| 4 | 58293 | 1 |
# The Elbow Method to find the optimal number of clusters.
from sklearn.cluster import KMeans

# Fit KMeans for 1..10 clusters and record each model's inertia_ (the
# within-cluster sum of squares) to look for the "elbow" in the curve.
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    km = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
    km.fit(x)
    wcss.append(km.inertia_)

plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method', fontsize = 20)
plt.xlabel('No. of Clusters')
plt.ylabel('wcss')
plt.show()
# Visualizing the clusters (5 clusters over Income and Kidhome).
plt.style.use('fivethirtyeight')
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
y_means = km.fit_predict(x)

# One scatter layer per cluster. KMeans label indices are arbitrary, so the
# legend uses neutral names rather than attaching a meaning to each index.
cluster_colors = ['pink', 'yellow', 'cyan', 'magenta', 'orange']
for cluster_id, cluster_color in enumerate(cluster_colors):
    members = x[y_means == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s = 100, c = cluster_color,
                label = 'Cluster {}'.format(cluster_id + 1))

# Overlay the fitted cluster centers.
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s = 50, c = 'blue', label = 'centroid')
plt.title('K Means Clustering between Income and Kids in the home', fontsize = 20)
plt.xlabel('Income')
plt.ylabel('Kidhome')
plt.legend()
# Pass True explicitly: a bare plt.grid() toggles the current grid state, which
# would switch the grid off under a style that enables it by default.
plt.grid(True)
plt.show()
# Elbow method for the Income / Kidhome clustering input.
from sklearn.cluster import KMeans

# Fit KMeans for 1..10 clusters and record each model's inertia_ (the
# within-cluster sum of squares).
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

plt.rcParams['figure.figsize'] = (15, 5)
plt.plot(cluster_counts, wcss)
plt.title('K-Means Clustering(The Elbow Method)', fontsize = 20)
# The curve is WCSS against the number of clusters, so the axes are labelled
# with those quantities rather than with the feature names.
plt.xlabel('No. of Clusters')
plt.ylabel('wcss')
# Pass True explicitly: a bare plt.grid() toggles the current grid state.
plt.grid(True)
plt.show()
# Cluster the Income / Kidhome input into 4 groups and plot the result.
kmeans = KMeans(n_clusters = 4, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
ymeans = kmeans.fit_predict(x)

# Apply the style before drawing so it affects this figure.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10, 10)
# The plotted features are Income and Kidhome, so the title names them.
plt.title('Clusters of Income and Kidhome', fontsize = 30)

# One scatter layer per cluster, colored by cluster index.
cluster_colors = ['pink', 'orange', 'lightgreen', 'red']
for cluster_id, cluster_color in enumerate(cluster_colors):
    members = x[ymeans == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s = 100, c = cluster_color)

# Overlay the fitted cluster centers.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 50, c = 'black')
plt.xlabel('Income')
plt.ylabel('Kidhome')
# Pass True explicitly: a bare plt.grid() toggles the current grid state.
plt.grid(True)
plt.show()